# Sex-Disaggregated Citizenship Statistics: Data Gaps and Why These Matter
# Ashley Mantha-Hollands and Maarten Vink, European University Institute
# Manuscript under review
# January 2026

# Code to reproduce all plots
# Code prepared by MV

# Load packages
library(readxl)
library(dplyr)
library(stringr)
library(tidyverse)
library(eurostat)
library(janitor)
library(patchwork)
library(scales)
library(countrycode)

# Figure 1
# Eurostat naturalisation rate - sex-disaggregated

# download Eurostat data on citizenship acquisition by resident non-citizen population
# naturalisation rate - but only totals! and only back to 2009
migr_acqs0 <- get_eurostat("migr_acqs")
migr_acqs <- migr_acqs0 |>
  filter(indic_mg == "FOR_P_HHAB") |>
 # filter(sex == "T") |>
  select(country = geo, TIME_PERIOD, cit_acq_rate = values, sex) |>
  mutate(year = as.numeric(str_sub(TIME_PERIOD, 1, 4))) |>
  select(country, year, sex, cit_acq_rate) |>
  mutate(cit_acq_rate = round(cit_acq_rate, 1)) 
migr_acqs 

# calculate mean for trendline
migr_acqs_mean <- migr_acqs |> 
  filter(sex != 'T') |>
  group_by(sex) |>
  mutate(ymean = mean(cit_acq_rate)) |>
  ungroup() |>
# calculate mean by country
  group_by(country, sex) |>
  mutate(ctrymean = mean(cit_acq_rate)) |>
  ungroup()

# plot country means 2009 - 2023
Fig_cntrymean_natrat_sex <- migr_acqs_mean |>
  filter(year == 2019, sex != "T") |> # select only one year (note: ctrymean is constant across years per country)
  ggplot(
    aes(
      x = fct_reorder(country, ctrymean),
      y = ctrymean,
      fill = sex
    )
  ) +
  geom_bar(
    stat = "identity",
    position = position_dodge(width = 0.9),
    width = 0.6
  ) +
  geom_text(
    aes(label = round(ctrymean, 1)),
    position = position_dodge(width = 0.9),
    vjust = -0.5,
    size = 3
  ) +
  scale_fill_manual(
    values = c("F" = "grey70", "M" = "grey30"),
    breaks = c("F", "M"),
    labels = c("women", "men")
  ) +
  theme_bw() +
  theme(
    legend.position = c(0.5, 0.98),
    legend.justification = c(0.5, 1),
    legend.direction = "horizontal",
    legend.background = element_blank(),
    legend.key = element_blank()
  ) +
  theme(
    plot.title = element_text(
      size = 18,
      hjust = 0      # left-aligned
    ),
    axis.text.x = element_text(size = 11),
    axis.text.y = element_text(size = 11),
    legend.text = element_text(size = 14),
    plot.caption = element_text(size = 14)
  ) +
  xlab("") +
  ylab("\n") +
  labs(
    fill = NULL,
    title = "Mean citizenship acquisition rate across 33 European countries, 2009–2023",
    caption = ""
  )

Fig_cntrymean_natrat_sex

# save plot
jpeg('Fig.nat_rates_sex_country_mean.jpeg',  width = 20, height = 8, units = 'in', res = 400)
Fig_cntrymean_natrat_sex
dev.off()

## Explore sex-disaggregated stats by naturalisation type in 3 countries
# Code to reproduce Figure 2 in paper

# Germany

# ============================================================================
# MANUAL DOWNLOAD INSTRUCTIONS FOR DESTATIS DATA
# ============================================================================
# 
# 1. For 2000-2023 data:
#    - Visit: https://www-genesis.destatis.de/datenbank/online/statistic/12511/table/12511-0006
#    - Click "Customise" (Werteabruf)
#    - Under "Years": Select ALL years from 2000 to 2023
#    - Click "Transpose" to get years as rows
#    - Click the download icon (↓) and select Excel format
#    - Save as "destatis_2000_2023.xlsx" in your working directory
#
# 2. For 2024 data:
#    - Visit: https://www-genesis.destatis.de/datenbank/online/statistic/12511/table/12511-0007
#    - Click "Transpose" to get proper format
#    - Click the download icon (↓) and select Excel format
#    - Save as "destatis_2024.xlsx" in your working directory
#
# 3. Run this script from the directory containing both files
# ============================================================================


# ============================================================================
# FUNCTION: Clean and process DESTATIS 2000-2023 file
# ============================================================================
clean_destatis_2000_2023 <- function(filepath) {
  
  message("Processing DESTATIS 2000-2023 file...")
  
  # Read raw data without headers
  df_raw <- read_excel(filepath, col_names = FALSE)
  
  # Remove first 4 metadata rows
  df <- df_raw %>% slice(-(1:4))
  
  # Reset row numbers after removal
  rownames(df) <- NULL
  
  # Identify year rows (row 5 onwards in original = row 1 onwards after removal)
  # Years appear in column 1 starting from row 5 (now row 1)
  year_rows <- which(!is.na(df[[1]]) & str_detect(as.character(df[[1]]), "^20[0-2][0-9]$"))
  
  message(sprintf("Found %d years in the data", length(year_rows)))
  
  # Extract header information (rows 1-3 after removal = rows 5-7 in original)
  # Row 2 (original row 6): Country groups
  # Row 3 (original row 7): Sex
  country_groups <- df[2, ]
  sex_labels <- df[3, ]
  
  # Create a list to store data for each year
  all_data <- list()
  
  for (i in seq_along(year_rows)) {
    year_row <- year_rows[i]
    year_value <- as.integer(df[[1]][year_row])
    
    # Determine the range of legal basis rows for this year
    if (i < length(year_rows)) {
      next_year_row <- year_rows[i + 1]
      legal_rows <- (year_row + 1):(next_year_row - 1)
    } else {
      # Last year - go to end of data
      legal_rows <- (year_row + 1):nrow(df)
    }
    
    # Extract legal basis and values for this year
    for (legal_row in legal_rows) {
      legal_basis <- df[[2]][legal_row]  # Legal basis in column 2
      
      # Skip if legal basis is missing or empty
      if (is.na(legal_basis) || legal_basis == "") next
      
      # Extract values for each sex
      # Columns: 3=Male(Total), 4=empty, 5=Female(Total), 6=empty, 7=Total, ...
      # We want columns 3 (Male Total) and 5 (Female Total)
      male_value <- df[[3]][legal_row]
      female_value <- df[[5]][legal_row]
      
      # Convert "-" to NA
      male_value <- ifelse(male_value == "-", NA, male_value)
      female_value <- ifelse(female_value == "-", NA, female_value)
      
      # Add to data list
      all_data <- append(all_data, list(
        data.frame(
          year = year_value,
          Geschlecht = "Männer",
          Rechtsgrundlagen = legal_basis,
          value = as.numeric(male_value),
          stringsAsFactors = FALSE
        )
      ))
      
      all_data <- append(all_data, list(
        data.frame(
          year = year_value,
          Geschlecht = "Frauen",
          Rechtsgrundlagen = legal_basis,
          value = as.numeric(female_value),
          stringsAsFactors = FALSE
        )
      ))
    }
  }
  
  # Combine all data
  df_clean <- bind_rows(all_data)
  
  message(sprintf("Processed data: %d rows", nrow(df_clean)))
  
  return(df_clean)
}

# ============================================================================
# FUNCTION: Clean and process DESTATIS 2024 file
# ============================================================================
clean_destatis_2024 <- function(filepath) {
  
  message("Processing DESTATIS 2024 file...")
  
  # Read raw data without headers
  df_raw <- read_excel(filepath, col_names = FALSE)
  
  # Remove first 4 metadata rows
  df <- df_raw %>% slice(-(1:4))
  
  # Reset row numbers
  rownames(df) <- NULL
  
  # Year is in row 5 (now row 1 after removal)
  year_row <- which(!is.na(df[[1]]) & str_detect(as.character(df[[1]]), "^2024$"))[1]
  year_value <- 2024
  
  message(sprintf("Processing year: %d", year_value))
  
  # Legal basis rows start from year_row + 1
  legal_rows <- (year_row + 1):nrow(df)
  
  # Create a list to store data
  all_data <- list()
  
  for (legal_row in legal_rows) {
    legal_basis <- df[[1]][legal_row]  # Legal basis in column 1 for 2024 file
    
    # Skip if legal basis is missing or empty
    if (is.na(legal_basis) || legal_basis == "") next
    
    # Extract values for each sex
    # Columns: 2=Male(Total), 3=empty, 4=Female(Total), 5=empty, 6=Total, ...
    male_value <- df[[2]][legal_row]
    female_value <- df[[4]][legal_row]
    
    # Convert "-" and "e" to NA
    male_value <- ifelse(male_value %in% c("-", "e"), NA, male_value)
    female_value <- ifelse(female_value %in% c("-", "e"), NA, female_value)
    
    # Add to data list
    all_data <- append(all_data, list(
      data.frame(
        year = year_value,
        Geschlecht = "Männer",
        Rechtsgrundlagen = legal_basis,
        value = as.numeric(male_value),
        stringsAsFactors = FALSE
      )
    ))
    
    all_data <- append(all_data, list(
      data.frame(
        year = year_value,
        Geschlecht = "Frauen",
        Rechtsgrundlagen = legal_basis,
        value = as.numeric(female_value),
        stringsAsFactors = FALSE
      )
    ))
  }
  
  # Combine all data
  df_clean <- bind_rows(all_data)
  
  message(sprintf("Processed data: %d rows", nrow(df_clean)))
  
  return(df_clean)
}

# ============================================================================
# PROCESS AND MERGE DESTATIS DATA
# ============================================================================

# Check if files exist
if (!file.exists("destatis_2000_2023.xlsx") | !file.exists("destatis_2024.xlsx")) {
  message("\n=== DESTATIS FILES NOT FOUND ===")
  message("Please download the files following the instructions at the top of this script.")
  message("Required files:")
  message("  - destatis_2000_2023.xlsx")
  message("  - destatis_2024.xlsx")
  message("")
  message("Current working directory: ", getwd())
#  message("\nFalling back to combined file if available...")
  message("================================\n")
  
  # Fallback to manual file if it exists
#  if (file.exists("DE_2000_2024_comb.xlsx")) {
#    message("Using manually combined file: DE_2000_2024_comb.xlsx\n")
#    df_de0 <- read_excel("DE_2000_2024_comb.xlsx")
#  } else {
#    stop("Neither DESTATIS files nor combined file found. Please download data files.")
#  }
} else {
  # Process both files
  df_2000_2023 <- clean_destatis_2000_2023("destatis_2000_2023.xlsx")
  df_2024 <- clean_destatis_2024("destatis_2024.xlsx")
  
  # Check structure before merging
  message("\n=== Data Structure Check ===")
  message("2000-2023 data:")
  message(sprintf("  Rows: %d, Columns: %d", nrow(df_2000_2023), ncol(df_2000_2023)))
  message(sprintf("  Years: %s to %s", min(df_2000_2023$year, na.rm = TRUE), 
                  max(df_2000_2023$year, na.rm = TRUE)))
  message(sprintf("  Unique legal bases: %d", n_distinct(df_2000_2023$Rechtsgrundlagen)))
  
  message("\n2024 data:")
  message(sprintf("  Rows: %d, Columns: %d", nrow(df_2024), ncol(df_2024)))
  message(sprintf("  Years: %d", unique(df_2024$year)))
  message(sprintf("  Unique legal bases: %d", n_distinct(df_2024$Rechtsgrundlagen)))
  
  # Show sample values for verification
  message("\n=== Sample Data Verification ===")
  message("2024 data - §10 Abs.1 (Male):")
  sample_2024 <- df_2024 %>% 
    filter(str_detect(Rechtsgrundlagen, "S\\.10\\(1\\)"), Geschlecht == "Männer")
  if(nrow(sample_2024) > 0) {
    message(sprintf("  Legal basis: %s", sample_2024$Rechtsgrundlagen[1]))
    message(sprintf("  Value: %s (should be 102020)", sample_2024$value[1]))
  }
  
  message("\n2024 data - §10 Abs.2 (Female):")
  sample_2024_f <- df_2024 %>% 
    filter(str_detect(Rechtsgrundlagen, "S\\.10\\(2\\)"), Geschlecht == "Frauen")
  if(nrow(sample_2024_f) > 0) {
    message(sprintf("  Legal basis: %s", sample_2024_f$Rechtsgrundlagen[1]))
    message(sprintf("  Value: %s (should be 34300)", sample_2024_f$value[1]))
  }
  
  # Merge the datasets
  df_de0 <- bind_rows(df_2000_2023, df_2024) %>%
    arrange(year, Geschlecht, Rechtsgrundlagen)
  
  message("\n=== Merged Data ===")
  message(sprintf("Combined data: %d rows, %d columns", nrow(df_de0), ncol(df_de0)))
  message(sprintf("Years covered: %d to %d", 
                  min(df_de0$year, na.rm = TRUE), 
                  max(df_de0$year, na.rm = TRUE)))
  message(sprintf("Non-missing values: %d", sum(!is.na(df_de0$value))))
}

# Display structure
message("\n=== Final Data Structure ===")
print(str(df_de0))
message("\nFirst few rows:")
print(head(df_de0, 10))
message("\nLast few rows:")
print(tail(df_de0, 10))

# save files to check
library(openxlsx)
write.xlsx(
  list(
    "2000_2023" = df_2000_2023,
    "2024"      = df_2024
  ),
  file = "DE_naturalisation_data.xlsx"
)

# Recode, filter, and select
df_de <- df_de0 %>%
  rename(
    sex_raw = Geschlecht,
    nat_type_raw = Rechtsgrundlagen
  ) %>%
  mutate(
    sex = case_when(
      str_trim(sex_raw) == "Männer"  ~ "Male",
      str_trim(sex_raw) == "Frauen"  ~ "Female",
      TRUE ~ NA_character_
    ),
    # Match legal basis patterns - handle both German and English versions
    naturalisation_type = case_when(
      # §10 Abs.1 - Residence entitlement
      str_detect(nat_type_raw, fixed("§10 Abs.1 StAG")) |
        str_detect(nat_type_raw, fixed("Aufenthalt in Deutschland")) |
        str_detect(nat_type_raw, fixed("S.10(1)")) |
        str_detect(nat_type_raw, fixed("natural. of residents")) ~
        "residence_entitlement", # this includes naturalisations after 3 yrs based on S.10(3),Nat.Act (S.10(1), Nat.Act)
      
      # §10 Abs.2 - Marriage with children
      str_detect(nat_type_raw, fixed("§10 Abs. 2 StAG")) |
        str_detect(nat_type_raw, fixed("Ehegatte, minderjährige Kinder")) |
        str_detect(nat_type_raw, fixed("S.10(2)")) |
        str_detect(nat_type_raw, fixed("derivative nat.,spouse")) ~
        "marriage_children",
      
      # §8 - Residence discretionary
      str_detect(nat_type_raw, fixed("§8 StAG")) |
        str_detect(nat_type_raw, fixed("Niederlassung in Deutschland auf Dauer")) |
        str_detect(nat_type_raw, fixed("S.8")) |
        str_detect(nat_type_raw, fixed("settled perman. in Germ")) ~
        "residence_discretionary",
      
      # §9 - Marriage (spouse is German)
      str_detect(nat_type_raw, fixed("§9 StAG")) |
        str_detect(nat_type_raw, fixed("Ehe- oder Lebenspartner ist Deutscher")) |
        str_detect(nat_type_raw, fixed("S.9")) |
        str_detect(nat_type_raw, fixed("with German spouse")) ~
        "marriage",
      
      TRUE ~ NA_character_
    )
  ) %>%
  filter(year >= 2005) %>%
  filter(!is.na(naturalisation_type)) %>%
  select(year, sex, naturalisation_type, value)

message("\n=== Filtered Data for Analysis ===")
message(sprintf("Rows after filtering (year >= 2005, relevant legal bases): %d", nrow(df_de)))
print(df_de)

# 1) Collapse DE naturalisation types to marriage vs residence
df_de_lr <- df_de %>%
  mutate(
    route = case_when(
      naturalisation_type %in% c("marriage", "marriage_children") ~ "marriage",
      naturalisation_type %in% c("residence_entitlement", "residence_discretionary") ~ "residence",
      TRUE ~ NA_character_
    )
  ) %>%
  filter(!is.na(route)) %>%
  group_by(year, sex, route) %>%
  dplyr::summarise(value = sum(value, na.rm = TRUE), .groups = "drop") %>%
  arrange(year, sex, route)

message("\n=== Marriage vs Residence Aggregation ===")
print(df_de_lr)

# 2) Compute marriage share (marriage / (marriage + residence)) by year and sex
share_de <- df_de_lr %>%
  pivot_wider(names_from = route, values_from = value, values_fill = 0) %>%
  mutate(
    marriage_share = marriage / (marriage + residence)
  ) %>%
  select(year, sex, marriage, residence, marriage_share) %>%
  arrange(year, sex)

message("\n=== Marriage Share Calculation ===")
print(share_de)

# Mean marriage share by sex
mean_by_sex <- share_de %>%
  group_by(sex) %>%
  dplyr::summarise(mean_share = mean(marriage_share, na.rm = TRUE), .groups = "drop")

message("\n=== Mean Marriage Share by Sex ===")
print(mean_by_sex)

# Plot
ggplot(share_de, aes(x = year, y = marriage_share, fill = sex)) +
  geom_col() +
  facet_wrap(~ sex) +
  
  # dashed mean line
  geom_hline(
    data = mean_by_sex,
    aes(yintercept = mean_share),
    linetype = "dashed",
    colour = "grey50"
  ) +
  
  # value label above mean line
  geom_text(
    data = mean_by_sex,
    aes(
      x = -Inf,                      # anchor at left of panel
      y = mean_share,
      label = scales::percent(mean_share, accuracy = 1)
    ),
    hjust = 0.05,                    # nudge left
    vjust = -0.6,                    # nudge above line
    colour = "grey40",
    size = 3,
    inherit.aes = FALSE
  ) +
  
  scale_y_continuous(labels = scales::percent) +
  coord_cartesian(ylim = c(0, 0.5)) +
  scale_fill_manual(
    values = c(
      "Female" = "grey30",
      "Male"   = "grey70"
    )
  ) +
  labs(
    title = "Share of naturalisations in Germany based on marriage, by sex",
    subtitle = "Bars show annual shares; dashed line shows mean share (2005–2024) within each sex",
    caption = "Source: DESTATIS Tables 12511-0006 and 12511-0007; §9 StAG + §10 Abs. 2 StAG (marriage, incl. minor children) vs §10 Abs.1 StAG + §8 StAG (residence)",
    x = "",
    y = "Marriage share of naturalisations\n"
  ) +
  theme_minimal() +
  theme(legend.position = "none")

ggsave(
  filename = "Fig2a_DE_marriage_share_by_sex.png",
  width = 8,
  height = 5,
  dpi = 300
)

# UK data

# Import the Excel file from Home Office
# Read in the file from the replication package or download from url
uk_file <- "citizenship-datasets-sep-2025.xlsx"
if (!file.exists(uk_file)) {
  url <- "https://assets.publishing.service.gov.uk/media/691adb26493305b49ce6e844/citizenship-datasets-sep-2025.xlsx"
  download.file(url, destfile = uk_file, mode = "wb", quiet = TRUE)
}
df_uk0 <- readxl::read_excel(uk_file, sheet = "Data_Cit_D02", skip = 1) |>
  janitor::clean_names()

# check
names(df_uk0)

# rename column names
dim_cols <- c(
  "year", "quarter", "Nationality", "Sex", "Age",
  "Application type group", "Application type"
)
num_cols <- names(df_uk0)[vapply(df_uk0, is.numeric, logical(1))]
measure_cols <- setdiff(num_cols, intersect(num_cols, c("Year", "Quarter")))
# check
names(df_uk0)

# select and summarise
df_uk <- df_uk0 |>
  filter(application_type_group == "Naturalisation", # Application type group == "Naturalisation"
         application_type != "N/A") |> # remove Application type == "N/A"
  mutate(nationality = "Total") |>
  group_by(year, sex, application_type) |> # aggregate across all Nationality to Total, by Year and Sex
  dplyr::summarise(value = sum(grants, na.rm = TRUE), .groups = "drop") |>
  arrange(year, sex, application_type)
df_uk

# prepare plot data
plot_df <- df_uk |>
  filter(
    application_type %in% c(
      "Naturalisation based on marriage",
      "Naturalisation based on residence"
    ),
    sex %in% c("Female", "Male")
  )

#calculate relative preference (marriage share)
share_df <- plot_df |>
  tidyr::pivot_wider(
    names_from = application_type,
    values_from = value
  ) |>
  mutate(
    marriage_share =
      `Naturalisation based on marriage` /
      (`Naturalisation based on marriage` + `Naturalisation based on residence`)
  )

# Mean marriage share by sex (UK)
mean_uk_by_sex <- share_df %>%
  group_by(sex) %>%
  dplyr::summarise(mean_share = mean(marriage_share, na.rm = TRUE), .groups = "drop")

# plot
ggplot(share_df, aes(x = year, y = marriage_share, fill = sex)) +
  geom_col() +
  facet_wrap(~ sex) +
  
  # dashed mean line
  geom_hline(
    data = mean_uk_by_sex,
    aes(yintercept = mean_share),
    linetype = "dashed",
    colour = "grey50"
  ) +
  
  # value label above mean line
  geom_text(
    data = mean_uk_by_sex,
    aes(
      x = -Inf,
      y = mean_share,
      label = scales::percent(mean_share, accuracy = 1)
    ),
    hjust = 0,
    vjust = -0.6,
    colour = "grey40",
    size = 3
  ) +
  
  scale_y_continuous(labels = scales::percent) +
  coord_cartesian(ylim = c(0, 0.5)) +
  scale_fill_manual(
    values = c(
      "Female" = "grey30",
      "Male"   = "grey70"
    )
  ) +
  labs(
    title = "Share of naturalisation grants in the UK based on marriage, by sex",
    subtitle = "Bars show annual shares; dashed line shows mean share within each sex",
    caption = "Home Office, 2025, Dat_Cit_D02, Naturalisation based on residence and marriage",
    x = "",
    y = "Marriage share of grants\n"
  ) +
  theme_minimal() +
  theme(legend.position = "none")

ggsave(
  filename = "Fig2c_UK_marriage_share_by_sex.png",
  width = 8,
  height = 5,
  dpi = 300
)

# Switzerland

# Data downloaded from https://www.bfs.admin.ch/bfs/en/home/statistics/catalogues-databases.assetdetail.36074743.html
library(BFS)
library(pxR)
library(dplyr)
library(stringr)

# Download
asset_path <- bfs_download_asset(
  number_asset = "36074743",
  destfile = "bfs_datafile.px"
)

# Read
px <- read.px(asset_path)
df <- as.data.frame(px)

# Filter + select columns
df_filtered <- df %>%
  filter(Staatsangehörigkeit.ehemalig == "Staatsangehörigkeit ehemalig - Total") %>%
  filter(Geschlecht != "Geschlecht - Total") %>%
  mutate(age = as.numeric(str_extract(Alter, "\\d+"))) %>%
  filter(!is.na(age), age >= 18) %>%
  filter(Kanton == "Schweiz") %>%
  filter(Art.des.Erwerbs %in% c("Ordentliche Einbürgerung", "Erleichterte Einbürgerung")) %>%
  select(year = Jahr,
         sex  = Geschlecht,
         application_type = Art.des.Erwerbs,
         value) %>%
  mutate(
    sex = recode(sex, "Mann" = "Male", "Frau" = "Female"),
    application_type = recode(application_type,
                              "Ordentliche Einbürgerung" = "Ordinary naturalisation",
                              "Erleichterte Einbürgerung" = "Simplified naturalisation")
  )

# Aggregate
df_ch <- df_filtered %>%
  #mutate(value = as.numeric(as.character(value))) %>%  # <- crucial
  group_by(year, sex, application_type) %>%
  dplyr::summarise(value = sum(value, na.rm = TRUE)) %>%
  ungroup()
df_ch #56 obs of 4 vars

#calculate relative preference (marriage share)
share_ch <- df_ch |>
  tidyr::pivot_wider(
    names_from = application_type,
    values_from = value
  ) |>
  mutate(
    marriage_share =
      `Simplified naturalisation` /
      (`Simplified naturalisation` + `Ordinary naturalisation`)
  )

# Mean marriage share by sex (UK)
mean_ch_by_sex <- share_ch %>%
  group_by(sex) %>%
  dplyr::summarise(mean_share = mean(marriage_share, na.rm = TRUE), .groups = "drop")

# plot CH
ggplot(share_ch, aes(x = year, y = marriage_share, fill = sex)) +
  geom_col() +
  facet_wrap(~ sex) +
  
  # dashed mean line
  geom_hline(
    data = mean_ch_by_sex,
    aes(yintercept = mean_share),
    linetype = "dashed",
    colour = "grey50"
  ) +
  
  # value label above mean line
  geom_text(
    data = mean_ch_by_sex,
    aes(
      x = -Inf,
      y = mean_share,
      label = scales::percent(mean_share, accuracy = 1)
    ),
    hjust = 0,
    vjust = -0.6,
    colour = "grey40",
    size = 3
  ) +
  
  scale_y_continuous(labels = scales::percent) +
  coord_cartesian(ylim = c(0, 0.5)) +
  scale_fill_manual(
    values = c(
      "Female" = "grey30",
      "Male"   = "grey70"
    )
  ) +
  labs(
    title = "Share of naturalisation grants in Switzerland based on marriage, by sex",
    subtitle = "Bars show annual shares; dashed line shows mean share within each sex",
    caption = "Federal Statistical Office (Switzerland) 2025, Naturalisation based on residence and marriage",
    x = "",
    y = "Marriage share of grants\n"
  ) +
  theme_minimal() +
  theme(legend.position = "none")

ggsave(
  filename = "Fig2b_CH_marriage_share_by_sex.png",
  width = 8,
  height = 5,
  dpi = 300
)

# Figure 2: combi plot

# --- add country labels (assumes you have share_de and share_df already) ---
de_plot <- share_de %>% mutate(country = "Germany*") |> select(year, sex, marriage_share, country)
ch_plot <- share_ch %>% mutate(country = "Switzerland") |> select(year, sex, marriage_share, country) |>
  mutate(year = as.numeric(as.character(year)))
uk_plot <- share_df %>% mutate(country = "UK") |> select(year, sex, marriage_share, country)

both <- bind_rows(de_plot, ch_plot, uk_plot)

# --- mean lines by country x sex ---
means <- both %>%
  group_by(country, sex) %>%
  dplyr::summarise(mean_share = mean(marriage_share, na.rm = TRUE), .groups = "drop")

# --- combined faceted plot ---
ggplot(both, aes(x = year, y = marriage_share, fill = sex)) +
  geom_col() +
  facet_grid(country ~ sex) +
  
  # dashed mean line
  geom_hline(
    data = means,
    aes(yintercept = mean_share),
    linetype = "dashed",
    colour = "grey50"
  ) +
  
  # label above mean line (slightly left)
  geom_text(
    data = means,
    aes(
      x = -Inf,
      y = mean_share,
      label = scales::percent(mean_share, accuracy = 1)
    ),
    hjust = 0,
    vjust = -0.6,
    colour = "grey40",
    size = 3
  ) +
  
  scale_y_continuous(labels = scales::percent) +
  coord_cartesian(ylim = c(0, 0.5)) +
  scale_fill_manual(
    values = c(
      "female" = "grey30",
      "male"   = "grey70",
      "Female" = "grey30",
      "Male"   = "grey70"
    )
  ) +
  labs(
    title = "Share of naturalisations based on marriage, disaggregated by sex",
    subtitle = "Bars show annual shares; dashed line shows mean share within each sex",
    caption = "*Marriage-based statistics for Germany incl spousal transfer (§9 StAG) as well as co-naturalisation by spouses and minor children (§10 Abs.2 StAG)",
#    caption = "*Marriage-based statistics for Germany incl spousal transfer (§9 StAG, Ehe- oder Lebenspartner ist Deutscher) and co-naturalisation by spouses and minor children (§10 Abs.2 StAG, Ehegatte, minderjährige Kinder)",
    x = "",
    y = "Marriage share of naturalisation\n"
  ) +
  theme_minimal() +
  theme(
    legend.position = "none",
    strip.text.y = element_text(face = "bold", size = 16),  # Germany/UK labels
    strip.text.x = element_text(face = "bold")   # sex labels
  )

ggsave(
  filename = "Fig2_UK_CH_DE_marriage_share_by_sex.png",
  width = 10,
  height = 8,
  dpi = 300
)

### END ###

